# Immigration and the top 1%

# characteristics of joiners, leavers, and stayers: AGE


# # # #

## calculate distribution of age by joiner or stayer

# reference year: age is measured as of this year, and it is part of the
# input and output file names
base_year <- 2017

# top shares to process, given as the string tags used in the file names
ts_vec <- c("1", "01")


# Loop over top shares and stack the per-share age tables into `tab`.
#
# For each top-share tag in `ts_vec` the function below
#   1. reads 'output/<tag>_base_year<base_year>.fst',
#   2. labels rows as 'stayer' / 'joiner' from year_base and year_minus_1,
#   3. computes age = base_year - tyob and bins it,
#   4. counts rows per age bin x group_persist x migrant_comb and turns the
#      counts into proportions within each group_persist x migrant_comb cell,
#   5. saves a bar chart to 'output/age_Top<tag>_base_year<base_year>.pdf',
#   6. returns the (ungrouped) table with the numeric top share in column `ts`.
tab <- map_dfr(ts_vec, function(ts_string) {
  # age-bin break points depend on the top share; stop with a clear message
  # for a tag that has no break points instead of failing later in cut()
  age_groups <- switch(
    ts_string,
    '1' = c(0, 22, 24, 25, 26:70, 80),
    '01' = c(0, 25, 30, 35:60, 65, 70, 80),
    stop(
      "no age groups defined for top share '", ts_string, "'",
      call. = FALSE
    )
  )

  # top share as numeric
  top_share <- switch(
    ts_string,
    '1' = .01,
    '01' = .001,
    '001' = .0001,
    stop("unknown top share '", ts_string, "'", call. = FALSE)
  )


  # # # #

  ## load churn data and derive group and age
  data <-
    read_fst(paste0('output/', ts_string, '_base_year', base_year, '.fst')) %>%
    as_tibble() %>%
    mutate(
      tyob = as.numeric(tyob),
      # joiner / stayer; rows matching neither pattern get NA and are kept,
      # so they form their own group in the counts and in the plot
      # NOTE(review): confirm that keeping the NA group is intended
      group_persist = case_when(
        year_base == 1 & year_minus_1 == 1 ~ 'stayer',
        year_base == 1 & year_minus_1 == 0 ~ 'joiner',
        TRUE ~ NA_character_
      ),
      age = base_year - tyob
    ) %>%
    # keep ages 18 to 100 inclusive (rows with missing age are dropped too)
    filter(age >= 18 & age <= 100)


  # # # #

  ## bin age
  # cut() returns NA for ages beyond the last break point; those rows are
  # dropped. Each remaining bin is labelled by the largest age observed in it.
  data <- data %>%
    mutate(
      age_grouped = cut(
        age,
        breaks = age_groups,
        labels = FALSE,
        include.lowest = TRUE
      )
    ) %>%
    filter(!is.na(age_grouped)) %>%
    group_by(age_grouped) %>%
    mutate(age_grouped_max = max(age)) %>%
    ungroup()


  # # # #

  ## counts per cell and proportion within group_persist x migrant_comb
  dgraph <- data %>%
    group_by(age_grouped_max, group_persist, migrant_comb) %>%
    summarise(n = n()) %>%
    group_by(group_persist, migrant_comb) %>%
    mutate(
      group_tot = sum(n),
      prop = n / group_tot
    ) %>%
    ungroup()


  # # # #

  ## plot
  # two fill colours are supplied
  # NOTE(review): assumes migrant_comb takes at most two non-missing values
  age_plot <- dgraph %>%
    filter(age_grouped_max <= 80) %>%
    ggplot(aes(age_grouped_max, fill = migrant_comb)) +
    geom_col(position = 'dodge', aes(y = prop)) +
    scale_fill_manual(values = c('navy', 'dodgerblue')) +
    xlab('Age') +
    ylab('Proportion') +
    theme_minimal() +
    theme(legend.position = 'bottom') +
    facet_grid(. ~ group_persist)

  # save graph; the plot is passed explicitly rather than via last_plot()
  file_name <- paste0('age_Top', ts_string, '_base_year', base_year, '.pdf')

  ggsave(
    filename = file_name,
    plot = age_plot,
    path = 'output/',
    dpi = 'retina',
    height = 9,
    width = 16,
    units = 'cm'
  )


  # add top share variable and return
  dgraph %>%
    mutate(ts = top_share)

})


# smallest cell count in the combined table
# (presumably a minimum-cell-size check before export -- confirm)
min(tab$n)

# # # #

# export the combined table
write_csv(tab, paste0("output/", base_year, ".csv"))
